package com.asiainfo.zqx;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import java.util.regex.Pattern;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Caijinwang extends BreadthCrawler {
    private String seedurl = "http://www.caijing.com.cn/";
    // http://.*.caijing.com.cn/2022.*.shtml
    private String regurl = "http://.*.caijing.com.cn/2022.*.shtml";

    public Caijinwang(String crawlPath) {
        super(crawlPath, false);
        addSeed(seedurl);
        setThreads(1);
        //  setResumable(true);
    }

    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        if (page.matchType("text")&&page.select("div.sub_lt>span.news_name").text().contains("财经网")) {
            String url = page.url();
            String source="财经网";
            String writer="财经网";
            String time=page.select("div.sub_lt>span.news_time").text();
          //  page.select("div.sub_lt>span.news_name").text();
            String title = page.select("div.article>h2").text();
            String text=page.select("div.article-content>p").text();
            System.out.println(url);
            System.out.println(title);
            System.out.println(time);
            System.out.println(text);

        } else {
            Elements elements = page.select("a");
            for (int i = 0; i < elements.size(); i++) {
                if (elements.get(i).attr("abs:href").matches(regurl)) {
                    crawlDatums.add(new CrawlDatum(elements.get(i).attr("abs:href"), "text"));
                }
            }
        }

    }

    public static void main(String[] args) throws Exception {
        Caijinwang cai = new Caijinwang("caijingwang");
        cai.start(2);
    }
}
